This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.
library(broom)
library(splines)
library(purrr)
### define the fine spatial grids for each GP, the spatial resolution
### will be between -2 and 2 for all dimensions for simplicity
num_fine_int <- 80
fine_grid_list <- list(
x1 = seq(-2, 2, length.out = num_fine_int+1)
)
fine_grid_list
## $x1
## [1] -2.00 -1.95 -1.90 -1.85 -1.80 -1.75 -1.70 -1.65 -1.60 -1.55 -1.50 -1.45
## [13] -1.40 -1.35 -1.30 -1.25 -1.20 -1.15 -1.10 -1.05 -1.00 -0.95 -0.90 -0.85
## [25] -0.80 -0.75 -0.70 -0.65 -0.60 -0.55 -0.50 -0.45 -0.40 -0.35 -0.30 -0.25
## [37] -0.20 -0.15 -0.10 -0.05 0.00 0.05 0.10 0.15 0.20 0.25 0.30 0.35
## [49] 0.40 0.45 0.50 0.55 0.60 0.65 0.70 0.75 0.80 0.85 0.90 0.95
## [61] 1.00 1.05 1.10 1.15 1.20 1.25 1.30 1.35 1.40 1.45 1.50 1.55
## [73] 1.60 1.65 1.70 1.75 1.80 1.85 1.90 1.95 2.00
fine_grid <- fine_grid_list %>% as.data.frame() %>% tibble::as_tibble()
fine_basis_mat <- model.matrix(~ ns(x1, df = 8) -1, data = fine_grid)
colnames(fine_basis_mat)
## [1] "ns(x1, df = 8)1" "ns(x1, df = 8)2" "ns(x1, df = 8)3" "ns(x1, df = 8)4"
## [5] "ns(x1, df = 8)5" "ns(x1, df = 8)6" "ns(x1, df = 8)7" "ns(x1, df = 8)8"
fine_basis_mat %>% as.data.frame() %>% tbl_df() %>%
mutate(x1 = fine_grid$x1) %>%
tibble::rowid_to_column() %>%
tidyr::gather(key = "key", value = "value", -rowid, -x1) %>%
ggplot(mapping = aes(x = x1, y = value)) +
geom_line(mapping = aes(color = key),
size = 1.15) +
theme_bw() +
theme(legend.position = "top")
## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
set.seed(4545)
beta_true <- rnorm(n = ncol(fine_basis_mat), mean = 0, sd = 3)
beta_true
## [1] -4.7034162 1.7847890 3.8856362 1.0085650 -4.8070812 6.3484287 -1.0373638
## [8] -0.4311326
mean_trend_true <- as.numeric(fine_basis_mat %*% as.matrix(beta_true))
mean_trend_true
## [1] 0.00000000 -0.21806277 -0.43646885 -0.65556154 -0.87568414 -1.09717997
## [7] -1.32039232 -1.54566450 -1.77333981 -2.00376157 -2.23727307 -2.47260126
## [13] -2.70200764 -2.91613732 -3.10563544 -3.26114712 -3.37331751 -3.43279171
## [19] -3.43021488 -3.35623212 -3.20148858 -2.96042168 -2.64263805 -2.26153660
## [25] -1.83051627 -1.36297597 -0.87231463 -0.37193118 0.12477546 0.60440638
## [31] 1.05356264 1.46098004 1.82393331 2.14183187 2.41408518 2.64010266
## [37] 2.81929376 2.95106792 3.03483457 3.07000317 3.05598313 2.99262224
## [43] 2.88152150 2.72472026 2.52425787 2.28217366 2.00050699 1.68129720
## [49] 1.32658363 0.93840561 0.51880251 0.07268969 -0.38351331 -0.83051092
## [55] -1.24900757 -1.61970769 -1.92331571 -2.14053606 -2.25207317 -2.23863146
## [61] -2.08091538 -1.76856062 -1.32692797 -0.79030951 -0.19299729 0.43071661
## [67] 1.04654014 1.62018121 2.11734777 2.50374776 2.74508909 2.81578424
## [73] 2.72506375 2.49086269 2.13111613 1.66375914 1.10672680 0.47795417
## [79] -0.20462367 -0.92307165 -1.65945470
fine_grid_df <- fine_grid %>% mutate(mean_trend = mean_trend_true)
fine_grid_df
## # A tibble: 81 x 2
## x1 mean_trend
## <dbl> <dbl>
## 1 -2 0
## 2 -1.95 -0.218
## 3 -1.9 -0.436
## 4 -1.85 -0.656
## 5 -1.8 -0.876
## 6 -1.75 -1.10
## 7 -1.7 -1.32
## 8 -1.65 -1.55
## 9 -1.6 -1.77
## 10 -1.55 -2.00
## # … with 71 more rows
fine_grid_df %>%
ggplot(mapping = aes(x = x1, y = mean_trend)) +
geom_line(size = 1.15) +
theme_bw()
### generate the noisy observations
sd_noise_1d <- 0.4 # noise
set.seed(65646)
fine_df_1d <- fine_grid_df %>%
mutate(y = rnorm(n = n(), mean = mean_trend, sd = sd_noise_1d))
fine_df_1d
## # A tibble: 81 x 3
## x1 mean_trend y
## <dbl> <dbl> <dbl>
## 1 -2 0 0.548
## 2 -1.95 -0.218 -0.674
## 3 -1.9 -0.436 -0.521
## 4 -1.85 -0.656 -0.573
## 5 -1.8 -0.876 -0.698
## 6 -1.75 -1.10 -1.58
## 7 -1.7 -1.32 -0.987
## 8 -1.65 -1.55 -1.06
## 9 -1.6 -1.77 -2.49
## 10 -1.55 -2.00 -1.90
## # … with 71 more rows
### work with a coarse grid instead of all of the points in the fine grid
num_coarse_int <- 40
coarse_grid_list <- list(
x1 = seq(-2, 2, length.out = num_coarse_int+1)
)
coarse_grid <- coarse_grid_list %>% as.data.frame() %>% tibble::as_tibble()
coarse_grid
## # A tibble: 41 x 1
## x1
## <dbl>
## 1 -2
## 2 -1.9
## 3 -1.8
## 4 -1.7
## 5 -1.6
## 6 -1.5
## 7 -1.4
## 8 -1.30
## 9 -1.2
## 10 -1.1
## # … with 31 more rows
train_df_1d <- fine_df_1d %>%
right_join(coarse_grid, by = c("x1"))
train_df_1d
## # A tibble: 41 x 3
## x1 mean_trend y
## <dbl> <dbl> <dbl>
## 1 -2 0 0.548
## 2 -1.9 -0.436 -0.521
## 3 -1.8 -0.876 -0.698
## 4 -1.7 -1.32 -0.987
## 5 -1.6 -1.77 -2.49
## 6 -1.5 -2.24 -2.32
## 7 -1.4 -2.70 -3.03
## 8 -1.30 -3.11 -3.45
## 9 -1.2 -3.37 -3.70
## 10 -1.1 -3.43 -2.76
## # … with 31 more rows
train_df_1d %>%
ggplot(mapping = aes(x = x1, y = mean_trend)) +
geom_line(size = 1.15) +
geom_point(mapping = aes(y = y), color = "red") +
theme_bw()
ThinPlateSplines_fit_func_No_Reg <- function(df_1, data){
model <- lm(y ~ smoothCon(s(x1, k=df_1, bs = 'ts'), data = data)[[1]]$X - 1, data = data)
metrics <- glance(model) %>% mutate(df_1 = df_1)
return(metrics)
}
df_grid<- list(x1 = c(5:18)) %>% as.data.frame() %>% tbl_df()
df_grid
## # A tibble: 14 x 1
## x1
## <int>
## 1 5
## 2 6
## 3 7
## 4 8
## 5 9
## 6 10
## 7 11
## 8 12
## 9 13
## 10 14
## 11 15
## 12 16
## 13 17
## 14 18
ThinPlateSplines_comp_fit <- map_dfr(df_grid$x1, ThinPlateSplines_fit_func_No_Reg, data = train_df_1d)
ThinPlateSplines_comp_fit
## # A tibble: 14 x 13
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.605 0.551 1.41 11.1 1.69e- 6 5 -69.7 151. 162.
## 2 0.755 0.713 1.13 18.0 2.19e- 9 6 -60.0 134. 146.
## 3 0.907 0.888 0.706 47.5 1.03e-15 7 -40.1 96.1 110.
## 4 0.938 0.923 0.587 62.1 1.12e-17 8 -31.9 81.8 97.2
## 5 0.970 0.962 0.411 117. 6.92e-22 9 -16.6 53.2 70.4
## 6 0.977 0.970 0.369 131. 1.68e-22 10 -11.5 45.0 63.9
## 7 0.977 0.969 0.374 116. 1.92e-21 11 -11.5 46.9 67.5
## 8 0.977 0.968 0.380 103. 2.06e-20 12 -11.4 48.8 71.1
## 9 0.977 0.967 0.383 93.4 1.67e-19 13 -11.0 50.1 74.1
## 10 0.980 0.969 0.372 92.5 4.47e-19 14 -9.03 48.1 73.8
## 11 0.981 0.971 0.362 91.1 1.39e-18 15 -7.19 46.4 73.8
## 12 0.982 0.970 0.366 83.6 1.10e-17 16 -6.83 47.7 76.8
## 13 0.983 0.970 0.364 79.8 5.44e-17 17 -5.73 47.5 78.3
## 14 0.983 0.970 0.366 74.2 3.66e-16 18 -5.17 48.3 80.9
## # … with 4 more variables: deviance <dbl>, df.residual <int>, nobs <int>,
## # df_1 <int>
df_best_ThinPlateSplines_AIC <- ThinPlateSplines_comp_fit %>% filter(AIC == min(AIC)) %>% select(df_1)
df_best_ThinPlateSplines_AIC
## # A tibble: 1 x 1
## df_1
## <int>
## 1 10
ThinPlateSplines_smooth_Best_AIC = smoothCon(s(x1, k=df_best_ThinPlateSplines_AIC$df_1, bs = 'ts'), data = train_df_1d)[[1]]
ThinPlateSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tibble::as_tibble()
## # A tibble: 41 x 10
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -1.58 1.65 1.82 1.95 -1.95 2.07 -2.02 -2.14 1 -1.69
## 2 -1.50 1.63 1.73 1.92 -1.86 2.05 -1.92 -2.11 1 -1.61
## 3 -1.42 1.61 1.63 1.89 -1.75 2.01 -1.80 -2.06 1 -1.52
## 4 -1.34 1.59 1.54 1.86 -1.63 1.94 -1.65 -1.95 1 -1.44
## 5 -1.26 1.56 1.44 1.80 -1.50 1.83 -1.47 -1.78 1 -1.35
## 6 -1.18 1.53 1.34 1.72 -1.34 1.68 -1.25 -1.56 1 -1.27
## 7 -1.11 1.49 1.23 1.62 -1.16 1.49 -1.02 -1.33 1 -1.18
## 8 -1.04 1.45 1.11 1.49 -0.972 1.27 -0.798 -1.12 1 -1.10
## 9 -0.966 1.40 0.990 1.33 -0.775 1.06 -0.617 -0.978 1 -1.01
## 10 -0.899 1.34 0.861 1.15 -0.587 0.868 -0.493 -0.905 1 -0.930
## # … with 31 more rows
ThinPlateSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tibble::as_tibble() %>%
mutate(x1 = train_df_1d$x1) %>%
tibble::rowid_to_column() %>%
tidyr::gather(key = "key", value = "value", -rowid, -x1) %>%
ggplot(mapping = aes(x = x1, y = value)) +
geom_line(mapping = aes(color = key),
size = 1.15) +
theme_bw() +
theme(legend.position = "top")
ThinPlateSplines_train_df <- ThinPlateSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tbl_df() %>% mutate(y = train_df_1d$y)
ThinPlateSplines_smooth_Best_AIC_fit <- lm(y ~ . - 1, data = ThinPlateSplines_train_df)
ThinPlateSplines_smooth_Best_AIC_fit %>% summary()
##
## Call:
## lm(formula = y ~ . - 1, data = ThinPlateSplines_train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.57779 -0.27556 -0.03859 0.20834 0.77819
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## V1 15.3868 3.1347 4.909 2.79e-05 ***
## V2 -6.4735 0.5236 -12.363 1.61e-13 ***
## V3 -1.3574 0.9152 -1.483 0.14812
## V4 -5.1303 0.3352 -15.304 5.35e-16 ***
## V5 -7.9014 0.5062 -15.609 3.11e-16 ***
## V6 -2.8075 0.4251 -6.604 2.23e-07 ***
## V7 -3.7503 0.5657 -6.630 2.07e-07 ***
## V8 1.5624 0.5277 2.961 0.00584 **
## V9 10.5557 1.6547 6.379 4.19e-07 ***
## V10 -13.8512 1.2411 -11.161 2.19e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3685 on 31 degrees of freedom
## Multiple R-squared: 0.9769, Adjusted R-squared: 0.9695
## F-statistic: 131.3 on 10 and 31 DF, p-value: < 2.2e-16
coefplot::coefplot(ThinPlateSplines_smooth_Best_AIC_fit) + theme_bw()
ThinPlateSplines_smooth_Best_AIC_fit %>% glance()
## # A tibble: 1 x 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.977 0.970 0.369 131. 1.68e-22 10 -11.5 45.0 63.9
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
ThinPlateSplines_test_basis_df <- PredictMat(ThinPlateSplines_smooth_Best_AIC, fine_df_1d) %>% as.data.frame() %>% tbl_df()
ThinPlateSplines_test_pred <- predict(ThinPlateSplines_smooth_Best_AIC_fit, newdata = ThinPlateSplines_test_basis_df)
fine_df_1d %>%
mutate(y_pred = ThinPlateSplines_test_pred) %>%
ggplot(mapping = aes(x = x1, y = y_pred)) +
geom_line(size = 1.15) +
geom_point(mapping = aes(y = y), color = "red") +
theme_bw()
ThinPlateSplines_smooth_overfit = smoothCon(s(x1, k=30, bs = 'ts'), data = train_df_1d)[[1]]
ThinPlateSplines_smooth_overfit$X %>% as.data.frame() %>% tibble::as_tibble()
## # A tibble: 41 x 30
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -1.61 0.775 1.78 1.04 -1.92 1.16 1.99 1.22 2.03 1.26 -2.06 -1.29
## 2 -1.53 0.796 1.69 1.06 -1.82 1.18 1.89 1.24 1.92 1.28 -1.95 -1.30
## 3 -1.45 0.818 1.60 1.08 -1.72 1.19 1.77 1.23 1.79 1.25 -1.79 -1.24
## 4 -1.37 0.837 1.51 1.09 -1.60 1.16 1.62 1.17 1.60 1.14 -1.56 -1.09
## 5 -1.30 0.854 1.41 1.07 -1.47 1.10 1.43 1.04 1.36 0.958 -1.29 -0.882
## 6 -1.22 0.866 1.31 1.04 -1.31 0.992 1.22 0.869 1.11 0.761 -1.04 -0.716
## 7 -1.14 0.872 1.20 0.978 -1.13 0.844 0.984 0.680 0.881 0.608 -0.881 -0.650
## 8 -1.07 0.870 1.08 0.888 -0.940 0.671 0.765 0.517 0.721 0.538 -0.810 -0.663
## 9 -0.999 0.859 0.959 0.771 -0.744 0.496 0.586 0.413 0.639 0.543 -0.784 -0.672
## 10 -0.931 0.837 0.831 0.632 -0.557 0.343 0.463 0.378 0.616 0.571 -0.735 -0.601
## # … with 31 more rows, and 18 more variables: V13 <dbl>, V14 <dbl>, V15 <dbl>,
## # V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>, V21 <dbl>,
## # V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>, V27 <dbl>,
## # V28 <dbl>, V29 <dbl>, V30 <dbl>
ThinPlateSplines_smooth_overfit$X %>% as.data.frame() %>% tibble::as_tibble() %>%
mutate(x1 = train_df_1d$x1) %>%
tibble::rowid_to_column() %>%
tidyr::gather(key = "key", value = "value", -rowid, -x1) %>%
ggplot(mapping = aes(x = x1, y = value)) +
geom_line(mapping = aes(color = key),
size = 1.15) +
theme_bw() +
theme(legend.position = "top")
ThinPlateSplines__overfit_train_df <- ThinPlateSplines_smooth_overfit$X %>% as.data.frame() %>% tbl_df() %>% mutate(y = train_df_1d$y)
ThinPlateSplines_smooth_overfit_fit <- lm(y ~ . - 1, data = ThinPlateSplines__overfit_train_df)
ThinPlateSplines_smooth_overfit_fit %>% summary()
##
## Call:
## lm(formula = y ~ . - 1, data = ThinPlateSplines__overfit_train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.39433 -0.16298 0.00417 0.14536 0.42438
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## V1 19.25274 17.22939 1.117 0.287620
## V2 -9.10312 2.03978 -4.463 0.000958 ***
## V3 -3.13689 4.22109 -0.743 0.472966
## V4 -5.71051 0.47887 -11.925 1.24e-07 ***
## V5 -7.50099 1.00034 -7.498 1.20e-05 ***
## V6 -3.04956 0.44389 -6.870 2.69e-05 ***
## V7 3.57004 0.67840 5.262 0.000267 ***
## V8 -1.69355 0.52718 -3.212 0.008269 **
## V9 -0.26929 0.68976 -0.390 0.703689
## V10 -0.24306 0.62498 -0.389 0.704769
## V11 0.59326 0.76645 0.774 0.455226
## V12 -1.15558 0.72297 -1.598 0.138265
## V13 -1.25455 0.85634 -1.465 0.170905
## V14 0.60009 0.81719 0.734 0.478103
## V15 -1.10174 0.94608 -1.165 0.268846
## V16 0.76193 0.90525 0.842 0.417887
## V17 0.34213 1.03024 0.332 0.746069
## V18 0.28867 0.98476 0.293 0.774871
## V19 0.87147 1.10502 0.789 0.446980
## V20 0.77925 1.05287 0.740 0.474726
## V21 1.71454 1.16652 1.470 0.169630
## V22 -1.81793 1.10588 -1.644 0.128450
## V23 0.93086 1.21009 0.769 0.457947
## V24 -1.97274 1.13907 -1.732 0.111202
## V25 -1.05248 1.22995 -0.856 0.410408
## V26 -0.01229 1.14646 -0.011 0.991639
## V27 1.34061 1.21901 1.100 0.294925
## V28 -0.82839 1.12086 -0.739 0.475340
## V29 6.43951 9.01735 0.714 0.490027
## V30 -23.17077 5.59364 -4.142 0.001638 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3638 on 11 degrees of freedom
## Multiple R-squared: 0.992, Adjusted R-squared: 0.9703
## F-statistic: 45.62 on 30 and 11 DF, p-value: 5.639e-08
coefplot::coefplot(ThinPlateSplines_smooth_overfit_fit) + theme_bw()
ThinPlateSplines_smooth_overfit_fit %>% glance()
## # A tibble: 1 x 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.992 0.970 0.364 45.6 5.64e-8 30 10.3 41.5 94.6
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
ThinPlateSplines_overfit_test_basis_df <- PredictMat(ThinPlateSplines_smooth_overfit, fine_df_1d) %>% as.data.frame() %>% tbl_df()
ThinPlateSplines_overfit_test_pred <- predict(ThinPlateSplines_smooth_overfit_fit, newdata = ThinPlateSplines_overfit_test_basis_df)
fine_df_1d %>%
mutate(y_pred = ThinPlateSplines_overfit_test_pred) %>%
ggplot(mapping = aes(x = x1, y = y_pred)) +
geom_line(size = 1.15) +
geom_point(mapping = aes(y = y), color = "red") +
theme_bw()
#Using regularization
minimize_error_func <- function(beta, my_info){
basis_mat = my_info$basis_mat
response_vec = my_info$yobs
lambda = my_info$lambda
penalty_mat = my_info$penalty_mat
linear_pred = as.vector(basis_mat %*% as.matrix(beta))
sum_of_squares = sum((response_vec - linear_pred) ^ 2)
penalty = t(as.matrix(beta)) %*% (penalty_mat %*% as.matrix(beta))
objective = sum_of_squares + lambda * penalty
objective
}
ThinPlateSplines_fit_func_Reg <- function(df_1, data, lambda){
Smooth <- smoothCon(s(x1, k=df_1, bs = 'ts'), data = data)[[1]]
Smooth_basis <- Smooth$X
Smooth_penalty <- Smooth$S
info_list <- list(
yobs = data$y,
basis_mat = as.matrix(Smooth_basis %>% as.data.frame() %>% tbl_df()),
penalty_mat = as.matrix(Smooth_penalty %>% as.data.frame() %>% tbl_df()),
lambda = lambda
)
start_guess = rnorm(n = ncol(info_list$basis_mat), mean = 0, sd = 2)
fit <- optim(start_guess,
minimize_error_func,
gr = NULL,
info_list,
method = "BFGS",
hessian = TRUE,
control = list(fnscale = 1, maxit = 1001))
return(list("fit" = fit, "Smooth" = Smooth))
}
ThinPlateSplines_reg <- ThinPlateSplines_fit_func_Reg(25, train_df_1d, 0.0001)
#ThinPlateSplines_reg$fit
ThinPlateSplines_reg_test_basis_df <- PredictMat(ThinPlateSplines_reg$Smooth, fine_df_1d) %>% as.data.frame() %>% tbl_df()
ThinPlateSplines_reg_test_pred <- as.matrix(ThinPlateSplines_reg_test_basis_df) %*% as.matrix(ThinPlateSplines_reg$fit$par)
# ThinPlateSplines_reg_test_pred
fine_df_1d %>%
mutate(y_pred = ThinPlateSplines_reg_test_pred) %>%
mutate(y_pred_unreg = ThinPlateSplines_overfit_test_pred) %>%
ggplot(mapping = aes(x = x1, y = y_pred)) +
geom_line(size = 1.15) +
geom_line(mapping = aes(y = y_pred_unreg), color = 'blue') +
geom_point(mapping = aes(y = y), color = "red") +
theme_bw()
CubicSplines_fit_func_No_Reg <- function(df_1, data){
model <- lm(y ~ smoothCon(s(x1, k=df_1, bs = 'cc'), data = data)[[1]]$X - 1, data = data)
metrics <- glance(model) %>% mutate(df_1 = df_1)
return(metrics)
}
df_grid_cubic<- list(x1 = c(5:18)) %>% as.data.frame() %>% tbl_df()
df_grid_cubic
## # A tibble: 14 x 1
## x1
## <int>
## 1 5
## 2 6
## 3 7
## 4 8
## 5 9
## 6 10
## 7 11
## 8 12
## 9 13
## 10 14
## 11 15
## 12 16
## 13 17
## 14 18
CubicSplines_comp_fit <- map_dfr(df_grid_cubic$x1, CubicSplines_fit_func_No_Reg, data = train_df_1d)
CubicSplines_comp_fit
## # A tibble: 14 x 13
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.700 0.668 1.22 21.6 2.97e- 9 4 -64.1 138. 147.
## 2 0.800 0.772 1.01 28.7 1.26e-11 5 -55.8 124. 134.
## 3 0.882 0.862 0.785 43.5 8.39e-15 6 -45.0 104. 116.
## 4 0.926 0.910 0.632 60.4 2.52e-17 7 -35.5 87.1 101.
## 5 0.951 0.939 0.520 80.2 2.16e-19 8 -26.9 71.9 87.3
## 6 0.951 0.937 0.529 68.9 2.14e-18 9 -27.0 74.0 91.1
## 7 0.950 0.934 0.543 58.8 2.49e-17 10 -27.4 76.8 95.7
## 8 0.952 0.935 0.539 54.4 9.76e-17 11 -26.4 76.9 97.4
## 9 0.955 0.936 0.532 51.3 3.23e-16 12 -25.2 76.5 98.7
## 10 0.959 0.940 0.516 50.7 6.10e-16 13 -23.2 74.4 98.4
## 11 0.964 0.945 0.494 51.5 8.83e-16 14 -20.7 71.4 97.1
## 12 0.963 0.942 0.507 45.7 7.69e-15 15 -21.0 73.9 101.
## 13 0.965 0.942 0.507 42.8 3.43e-14 16 -20.2 74.4 104.
## 14 0.966 0.942 0.508 40.2 1.47e-13 17 -19.4 74.9 106.
## # … with 4 more variables: deviance <dbl>, df.residual <int>, nobs <int>,
## # df_1 <int>
df_best_CubicSplines_AIC <- CubicSplines_comp_fit %>% filter(AIC == min(AIC)) %>% select(df_1)
df_best_CubicSplines_AIC
## # A tibble: 1 x 1
## df_1
## <int>
## 1 15
CubicSplines_smooth_Best_AIC = smoothCon(s(x1, k=df_best_CubicSplines_AIC$df_1, bs = 'cc'), data = train_df_1d)[[1]]
CubicSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tibble::as_tibble()
## # A tibble: 41 x 14
## V1 V2 V3 V4 V5 V6 V7 V8
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.00 0 0 0 -1.73e-18 0. 0. 5.42e-20
## 2 0.782 0.401 -0.0959 0.0257 -6.88e- 3 1.84e-3 -4.81e-4 8.21e- 5
## 3 0.334 0.835 -0.132 0.0353 -9.46e- 3 2.53e-3 -6.71e-4 1.51e- 4
## 4 -0.0368 0.995 0.0435 -0.0116 3.12e- 3 -8.35e-4 2.23e-4 -5.63e- 5
## 5 -0.136 0.725 0.468 -0.108 2.90e- 2 -7.77e-3 2.08e-3 -5.44e- 4
## 6 -0.0680 0.269 0.881 -0.123 3.30e- 2 -8.84e-3 2.37e-3 -6.28e- 4
## 7 0.0180 -0.0671 0.979 0.0931 -2.47e- 2 6.61e-3 -1.77e-3 4.73e- 4
## 8 0.0357 -0.133 0.664 0.535 -1.19e- 1 3.18e-2 -8.53e-3 2.28e- 3
## 9 0.0143 -0.0533 0.207 0.922 -1.10e- 1 2.94e-2 -7.88e-3 2.11e- 3
## 10 -0.00655 0.0244 -0.0912 0.955 1.48e- 1 -3.87e-2 1.04e-2 -2.78e- 3
## # … with 31 more rows, and 6 more variables: V9 <dbl>, V10 <dbl>, V11 <dbl>,
## # V12 <dbl>, V13 <dbl>, V14 <dbl>
CubicSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tibble::as_tibble() %>%
mutate(x1 = train_df_1d$x1) %>%
tibble::rowid_to_column() %>%
tidyr::gather(key = "key", value = "value", -rowid, -x1) %>%
ggplot(mapping = aes(x = x1, y = value)) +
geom_line(mapping = aes(color = key),
size = 1.15) +
theme_bw() +
theme(legend.position = "top")
CubicSplines_train_df <- CubicSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tbl_df() %>% mutate(y = train_df_1d$y)
CubicSplines_smooth_Best_AIC_fit <- lm(y ~ . - 1, data = CubicSplines_train_df)
CubicSplines_smooth_Best_AIC_fit %>% summary()
##
## Call:
## lm(formula = y ~ . - 1, data = CubicSplines_train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.40398 -0.19270 0.03066 0.25443 1.17067
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## V1 -0.6223 0.2688 -2.315 0.028439 *
## V2 -1.0236 0.3195 -3.204 0.003464 **
## V3 -3.1432 0.3202 -9.816 2.11e-10 ***
## V4 -3.2375 0.3206 -10.097 1.15e-10 ***
## V5 -2.8455 0.3207 -8.872 1.73e-09 ***
## V6 0.7397 0.3206 2.307 0.028951 *
## V7 2.2828 0.3204 7.124 1.16e-07 ***
## V8 3.2251 0.3203 10.068 1.23e-10 ***
## V9 1.8088 0.3204 5.645 5.42e-06 ***
## V10 0.1083 0.3206 0.338 0.738075
## V11 -1.8078 0.3207 -5.636 5.55e-06 ***
## V12 -1.2063 0.3206 -3.762 0.000827 ***
## V13 2.1297 0.3202 6.651 3.88e-07 ***
## V14 1.9278 0.3195 6.034 1.93e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4939 on 27 degrees of freedom
## Multiple R-squared: 0.9639, Adjusted R-squared: 0.9452
## F-statistic: 51.53 on 14 and 27 DF, p-value: 8.825e-16
coefplot::coefplot(CubicSplines_smooth_Best_AIC_fit) + theme_bw()
CubicSplines_smooth_Best_AIC_fit %>% glance()
## # A tibble: 1 x 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.964 0.945 0.494 51.5 8.83e-16 14 -20.7 71.4 97.1
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
CubicSplines_test_basis_df <- PredictMat(CubicSplines_smooth_Best_AIC, fine_df_1d) %>% as.data.frame() %>% tbl_df()
CubicSplines_test_pred <- predict(CubicSplines_smooth_Best_AIC_fit, newdata = CubicSplines_test_basis_df)
fine_df_1d %>%
mutate(y_pred = CubicSplines_test_pred) %>%
ggplot(mapping = aes(x = x1, y = y_pred)) +
geom_line(size = 1.15) +
geom_point(mapping = aes(y = y), color = "red") +
theme_bw()
CubicSplines_smooth_overfit = smoothCon(s(x1, k=30, bs = 'cc'), data = train_df_1d)[[1]]
CubicSplines_smooth_overfit$X %>% as.data.frame() %>% tibble::as_tibble()
## # A tibble: 41 x 29
## V1 V2 V3 V4 V5 V6 V7 V8
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.00e+0 5.55e-17 -2.78e-17 6.94e-18 0 4.34e-19 -1.08e-19 0.
## 2 3.01e-1 8.59e- 1 -1.28e- 1 3.43e- 2 -0.00919 2.46e- 3 -6.60e- 4 1.77e-4
## 3 -1.33e-1 6.64e- 1 5.35e- 1 -1.19e- 1 0.0318 -8.53e- 3 2.29e- 3 -6.13e-4
## 4 2.71e-2 -1.01e- 1 9.39e- 1 1.77e- 1 -0.0460 1.23e- 2 -3.30e- 3 8.84e-4
## 5 6.61e-3 -2.47e- 2 9.31e- 2 9.79e- 1 -0.0671 1.80e- 2 -4.81e- 3 1.29e-3
## 6 -7.34e-3 2.74e- 2 -1.02e- 1 4.34e- 1 0.754 -1.37e- 1 3.66e- 2 -9.81e-3
## 7 2.62e-3 -9.77e- 3 3.64e- 2 -1.36e- 1 0.782 4.01e- 1 -9.59e- 2 2.57e-2
## 8 -2.72e-4 1.01e- 3 -3.78e- 3 1.41e- 2 -0.0527 9.88e- 1 6.76e- 2 -1.80e-2
## 9 -2.75e-4 1.03e- 3 -3.83e- 3 1.43e- 2 -0.0533 2.07e- 1 9.22e- 1 -1.10e-1
## 10 1.70e-4 -6.36e- 4 2.37e- 3 -8.86e- 3 0.0331 -1.23e- 1 5.68e- 1 6.33e-1
## # … with 31 more rows, and 21 more variables: V9 <dbl>, V10 <dbl>, V11 <dbl>,
## # V12 <dbl>, V13 <dbl>, V14 <dbl>, V15 <dbl>, V16 <dbl>, V17 <dbl>,
## # V18 <dbl>, V19 <dbl>, V20 <dbl>, V21 <dbl>, V22 <dbl>, V23 <dbl>,
## # V24 <dbl>, V25 <dbl>, V26 <dbl>, V27 <dbl>, V28 <dbl>, V29 <dbl>
CubicSplines_smooth_overfit$X %>% as.data.frame() %>% tibble::as_tibble() %>%
mutate(x1 = train_df_1d$x1) %>%
tibble::rowid_to_column() %>%
tidyr::gather(key = "key", value = "value", -rowid, -x1) %>%
ggplot(mapping = aes(x = x1, y = value)) +
geom_line(mapping = aes(color = key),
size = 1.15) +
theme_bw() +
theme(legend.position = "top")
CubicSplines_overfit_train_df <- CubicSplines_smooth_overfit$X %>% as.data.frame() %>% tbl_df() %>% mutate(y = train_df_1d$y)
CubicSplines_smooth_overfit_fit <- lm(y ~ . - 1, data = CubicSplines_overfit_train_df)
CubicSplines_smooth_overfit_fit %>% summary()
##
## Call:
## lm(formula = y ~ . - 1, data = CubicSplines_overfit_train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.28031 -0.16479 0.00262 0.14720 1.29435
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## V1 -0.7459 0.4305 -1.733 0.108716
## V2 -0.5421 0.5904 -0.918 0.376613
## V3 -0.8982 0.5900 -1.522 0.153857
## V4 -2.4369 0.5887 -4.140 0.001372 **
## V5 -2.5162 0.5937 -4.238 0.001152 **
## V6 -3.6199 0.5884 -6.152 4.93e-05 ***
## V7 -3.2820 0.5918 -5.545 0.000127 ***
## V8 -3.1460 0.5929 -5.306 0.000186 ***
## V9 -3.1827 0.5875 -5.417 0.000156 ***
## V10 -1.8287 0.5936 -3.081 0.009520 **
## V11 0.3431 0.5905 0.581 0.571998
## V12 1.1930 0.5898 2.023 0.065974 .
## V13 2.3816 0.5937 4.011 0.001727 **
## V14 2.3285 0.5879 3.961 0.001890 **
## V15 3.4232 0.5924 5.778 8.76e-05 ***
## V16 2.8749 0.5924 4.853 0.000396 ***
## V17 2.4742 0.5879 4.209 0.001213 **
## V18 1.3457 0.5937 2.266 0.042715 *
## V19 0.7628 0.5898 1.293 0.220262
## V20 -0.4784 0.5905 -0.810 0.433659
## V21 -0.8730 0.5936 -1.471 0.167087
## V22 -2.2080 0.5875 -3.758 0.002729 **
## V23 -1.9631 0.5929 -3.311 0.006213 **
## V24 -0.5647 0.5918 -0.954 0.358875
## V25 0.4873 0.5884 0.828 0.423691
## V26 2.3337 0.5937 3.931 0.001997 **
## V27 2.7928 0.5887 4.744 0.000477 ***
## V28 1.6950 0.5900 2.873 0.014017 *
## V29 0.5219 0.5904 0.884 0.394070
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6327 on 12 degrees of freedom
## Multiple R-squared: 0.9737, Adjusted R-squared: 0.9101
## F-statistic: 15.31 on 29 and 12 DF, p-value: 7.934e-06
coefplot::coefplot(CubicSplines_smooth_overfit_fit) + theme_bw()
CubicSplines_smooth_overfit_fit %>% glance()
## # A tibble: 1 x 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.974 0.910 0.633 15.3 7.93e-6 29 -14.2 88.4 140.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
CubicSplines_overfit_test_basis_df <- PredictMat(CubicSplines_smooth_overfit, fine_df_1d) %>% as.data.frame() %>% tbl_df()
CubicSplines_overfit_test_pred <- predict(CubicSplines_smooth_overfit_fit, newdata = CubicSplines_overfit_test_basis_df)
fine_df_1d %>%
mutate(y_pred = CubicSplines_overfit_test_pred) %>%
ggplot(mapping = aes(x = x1, y = y_pred)) +
geom_line(size = 1.15) +
geom_point(mapping = aes(y = y), color = "red") +
theme_bw()
#Using regularization
CubicSplines_fit_func_Reg <- function(df_1, data, lambda){
Smooth <- smoothCon(s(x1, k=df_1, bs = 'cc'), data = data)[[1]]
Smooth_basis <- Smooth$X
Smooth_penalty <- Smooth$S
info_list <- list(
yobs = data$y,
basis_mat = as.matrix(Smooth_basis %>% as.data.frame() %>% tbl_df()),
penalty_mat = as.matrix(Smooth_penalty %>% as.data.frame() %>% tbl_df()),
lambda = lambda
)
start_guess = rnorm(n = ncol(info_list$basis_mat), mean = 0, sd = 2)
fit <- optim(start_guess,
minimize_error_func,
gr = NULL,
info_list,
method = "BFGS",
hessian = TRUE,
control = list(fnscale = 1, maxit = 1001))
return(list("fit" = fit, "Smooth" = Smooth))
}
CubicSplines_reg <- CubicSplines_fit_func_Reg(30, train_df_1d, 2.0)
CubicSplines_reg_test_basis_df <- PredictMat(CubicSplines_reg$Smooth, fine_df_1d) %>% as.data.frame() %>% tbl_df()
CubicSplines_reg_test_pred <- as.matrix(CubicSplines_reg_test_basis_df) %*% as.matrix(CubicSplines_reg$fit$par)
fine_df_1d %>%
mutate(y_pred = CubicSplines_reg_test_pred) %>%
mutate(y_pred_unreg = CubicSplines_overfit_test_pred) %>%
ggplot(mapping = aes(x = x1, y = y_pred)) +
geom_line(size = 1.15) +
geom_line(mapping = aes(y = y_pred_unreg), color = 'blue') +
geom_point(mapping = aes(y = y), color = "red") +
theme_bw()
TensorSplines_fit_func_No_Reg <- function(df_1, data){
model <- lm(y ~ smoothCon(ti(x1, k=df_1), data = data)[[1]]$X - 1, data = data)
metrics <- glance(model) %>% mutate(df_1 = df_1)
return(metrics)
}
df_grid_tensor<- list(x1 = c(5:18)) %>% as.data.frame() %>% tbl_df()
df_grid_tensor
## # A tibble: 14 x 1
## x1
## <int>
## 1 5
## 2 6
## 3 7
## 4 8
## 5 9
## 6 10
## 7 11
## 8 12
## 9 13
## 10 14
## 11 15
## 12 16
## 13 17
## 14 18
TensorSplines_comp_fit <- map_dfr(df_grid_tensor$x1, TensorSplines_fit_func_No_Reg, data = train_df_1d)
TensorSplines_comp_fit
## # A tibble: 14 x 13
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.648 0.610 1.32 17.1 5.20e- 8 4 -67.4 145. 153.
## 2 0.712 0.672 1.21 17.8 7.43e- 9 5 -63.3 139. 149.
## 3 0.873 0.851 0.814 40.1 2.87e-14 6 -46.5 107. 119.
## 4 0.939 0.926 0.574 74.2 1.00e-18 7 -31.6 79.2 92.9
## 5 0.970 0.963 0.405 135. 6.08e-23 8 -16.7 51.4 66.8
## 6 0.972 0.964 0.399 124. 2.82e-22 9 -15.5 50.9 68.1
## 7 0.972 0.963 0.407 107. 3.63e-21 10 -15.6 53.2 72.1
## 8 0.973 0.964 0.402 99.8 1.67e-20 11 -14.5 52.9 73.5
## 9 0.974 0.963 0.405 90.2 1.30e-19 12 -14.0 54.1 76.4
## 10 0.975 0.964 0.401 85.4 5.67e-19 13 -12.9 53.7 77.7
## 11 0.978 0.967 0.383 87.0 9.89e-19 14 -10.3 50.5 76.2
## 12 0.977 0.964 0.400 74.3 1.81e-17 15 -11.3 54.6 82.0
## 13 0.978 0.964 0.398 70.6 8.51e-17 16 -10.2 54.5 83.6
## 14 0.980 0.965 0.395 67.5 3.78e-16 17 -9.08 54.2 85.0
## # … with 4 more variables: deviance <dbl>, df.residual <int>, nobs <int>,
## # df_1 <int>
df_best_TensorSplines_AIC <- TensorSplines_comp_fit %>% filter(AIC == min(AIC)) %>% select(df_1)
df_best_TensorSplines_AIC
## # A tibble: 1 x 1
## df_1
## <int>
## 1 15
TensorSplines_smooth_Best_AIC = smoothCon(ti(x1, k=df_best_TensorSplines_AIC$df_1), data = train_df_1d)[[1]]
TensorSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tibble::as_tibble()
## # A tibble: 41 x 14
## V1 V2 V3 V4 V5 V6 V7 V8 V9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -0.298 -0.258 -0.269 -0.266 -0.267 -0.266 -0.267 -0.266 -0.267
## 2 0.333 -0.309 -0.148 -0.191 -0.180 -0.183 -0.182 -0.182 -0.182
## 3 0.794 -0.261 -0.0701 -0.121 -0.107 -0.111 -0.110 -0.110 -0.110
## 4 0.913 -0.0160 -0.0770 -0.0607 -0.0651 -0.0639 -0.0642 -0.0641 -0.0642
## 5 0.633 0.430 -0.161 -0.0195 -0.0573 -0.0472 -0.0499 -0.0492 -0.0494
## 6 0.191 0.835 -0.178 -0.0200 -0.0624 -0.0511 -0.0541 -0.0533 -0.0535
## 7 -0.131 0.919 0.0316 -0.0860 -0.0547 -0.0631 -0.0609 -0.0614 -0.0613
## 8 -0.195 0.600 0.471 -0.183 -0.0320 -0.0724 -0.0616 -0.0644 -0.0637
## 9 -0.120 0.145 0.859 -0.173 -0.0335 -0.0708 -0.0608 -0.0635 -0.0628
## 10 -0.0461 -0.150 0.892 0.0865 -0.100 -0.0511 -0.0643 -0.0608 -0.0618
## # … with 31 more rows, and 5 more variables: V10 <dbl>, V11 <dbl>, V12 <dbl>,
## # V13 <dbl>, V14 <dbl>
TensorSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tibble::as_tibble() %>%
mutate(x1 = train_df_1d$x1) %>%
tibble::rowid_to_column() %>%
tidyr::gather(key = "key", value = "value", -rowid, -x1) %>%
ggplot(mapping = aes(x = x1, y = value)) +
geom_line(mapping = aes(color = key),
size = 1.15) +
theme_bw() +
theme(legend.position = "top")
TensorSplines_train_df <- TensorSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tbl_df() %>% mutate(y = train_df_1d$y)
TensorSplines_smooth_Best_AIC_fit <- lm(y ~ . - 1, data = TensorSplines_train_df)
TensorSplines_smooth_Best_AIC_fit %>% summary()
##
## Call:
## lm(formula = y ~ . - 1, data = TensorSplines_train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.57797 -0.32042 -0.15595 0.09521 0.44434
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## V1 -1.2384 0.2717 -4.557 0.000100 ***
## V2 -2.9956 0.2451 -12.220 1.64e-12 ***
## V3 -3.2860 0.2605 -12.613 7.87e-13 ***
## V4 -2.7893 0.2538 -10.991 1.80e-11 ***
## V5 0.7408 0.2575 2.876 0.007762 **
## V6 2.3118 0.2554 9.050 1.15e-09 ***
## V7 3.2420 0.2564 12.642 7.46e-13 ***
## V8 1.8272 0.2559 7.139 1.12e-07 ***
## V9 0.1353 0.2564 0.528 0.601943
## V10 -1.8030 0.2561 -7.040 1.44e-07 ***
## V11 -1.1571 0.2561 -4.519 0.000111 ***
## V12 2.0935 0.2535 8.259 7.24e-09 ***
## V13 2.0753 0.2538 8.177 8.80e-09 ***
## V14 -1.8283 0.3359 -5.444 9.27e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3829 on 27 degrees of freedom
## Multiple R-squared: 0.9783, Adjusted R-squared: 0.9671
## F-statistic: 87.04 on 14 and 27 DF, p-value: < 2.2e-16
coefplot::coefplot(TensorSplines_smooth_Best_AIC_fit) + theme_bw()
TensorSplines_smooth_Best_AIC_fit %>% glance()
## # A tibble: 1 x 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.978 0.967 0.383 87.0 9.89e-19 14 -10.3 50.5 76.2
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
TensorSplines_test_basis_df <- PredictMat(TensorSplines_smooth_Best_AIC, fine_df_1d) %>% as.data.frame() %>% tbl_df()
TensorSplines_test_pred <- predict(TensorSplines_smooth_Best_AIC_fit, newdata = TensorSplines_test_basis_df)
fine_df_1d %>%
mutate(y_pred = TensorSplines_test_pred) %>%
ggplot(mapping = aes(x = x1, y = y_pred)) +
geom_line(size = 1.15) +
geom_point(mapping = aes(y = y), color = "red") +
theme_bw()
TensorSplines_smooth_overfit = smoothCon(ti(x1, k=30), data = train_df_1d)[[1]]
TensorSplines_smooth_overfit$X %>% as.data.frame() %>% tibble::as_tibble()
## # A tibble: 41 x 29
## V1 V2 V3 V4 V5 V6 V7 V8
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -0.195 -0.181 -0.186 -0.182 -0.186 -0.184 -0.183 -1.86e-1
## 2 0.871 -0.207 -0.0205 -0.0695 -0.0571 -0.0602 -0.0590 -6.01e-2
## 3 0.609 0.526 -0.140 0.0144 -0.0272 -0.0160 -0.0189 -1.84e-2
## 4 -0.127 0.906 0.146 -0.0770 -0.0192 -0.0345 -0.0302 -3.18e-2
## 5 -0.0551 0.0629 0.949 -0.0971 -0.0126 -0.0351 -0.0288 -3.10e-2
## 6 -0.00510 -0.130 0.405 0.726 -0.166 0.00791 -0.0384 -2.64e-2
## 7 -0.0403 0.00733 -0.166 0.753 0.371 -0.125 -0.00360 -3.66e-2
## 8 -0.0306 -0.0329 -0.0159 -0.0821 0.958 0.0380 -0.0475 -2.52e-2
## 9 -0.0303 -0.0328 -0.0155 -0.0825 0.177 0.892 -0.139 -3.61e-4
## 10 -0.0316 -0.0264 -0.0384 0.00415 -0.153 0.539 0.604 -1.60e-1
## # … with 31 more rows, and 21 more variables: V9 <dbl>, V10 <dbl>, V11 <dbl>,
## # V12 <dbl>, V13 <dbl>, V14 <dbl>, V15 <dbl>, V16 <dbl>, V17 <dbl>,
## # V18 <dbl>, V19 <dbl>, V20 <dbl>, V21 <dbl>, V22 <dbl>, V23 <dbl>,
## # V24 <dbl>, V25 <dbl>, V26 <dbl>, V27 <dbl>, V28 <dbl>, V29 <dbl>
TensorSplines_smooth_overfit$X %>% as.data.frame() %>% tibble::as_tibble() %>%
mutate(x1 = train_df_1d$x1) %>%
tibble::rowid_to_column() %>%
tidyr::gather(key = "key", value = "value", -rowid, -x1) %>%
ggplot(mapping = aes(x = x1, y = value)) +
geom_line(mapping = aes(color = key),
size = 1.15) +
theme_bw() +
theme(legend.position = "top")
TensorSplines_overfit_train_df <- TensorSplines_smooth_overfit$X %>% as.data.frame() %>% tbl_df() %>% mutate(y = train_df_1d$y)
TensorSplines_smooth_overfit_fit <- lm(y ~ . - 1, data = TensorSplines_overfit_train_df)
TensorSplines_smooth_overfit_fit %>% summary()
##
## Call:
## lm(formula = y ~ . - 1, data = TensorSplines_overfit_train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.52981 -0.26919 -0.12846 -0.01008 0.22824
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## V1 -0.6999 0.3805 -1.839 0.090740 .
## V2 -0.7736 0.3786 -2.043 0.063622 .
## V3 -2.4719 0.3873 -6.382 3.49e-05 ***
## V4 -2.4624 0.3876 -6.353 3.65e-05 ***
## V5 -3.6148 0.3865 -9.353 7.34e-07 ***
## V6 -3.2511 0.3877 -8.386 2.31e-06 ***
## V7 -3.1281 0.3888 -8.045 3.55e-06 ***
## V8 -3.1593 0.3853 -8.200 2.92e-06 ***
## V9 -1.8072 0.3891 -4.644 0.000566 ***
## V10 0.3655 0.3872 0.944 0.363856
## V11 1.2140 0.3868 3.139 0.008553 **
## V12 2.4048 0.3892 6.179 4.73e-05 ***
## V13 2.3493 0.3856 6.093 5.40e-05 ***
## V14 3.4457 0.3884 8.872 1.28e-06 ***
## V15 2.8972 0.3884 7.460 7.64e-06 ***
## V16 2.4953 0.3856 6.471 3.06e-05 ***
## V17 1.3685 0.3892 3.516 0.004252 **
## V18 0.7844 0.3868 2.028 0.065355 .
## V19 -0.4571 0.3872 -1.180 0.260737
## V20 -0.8494 0.3891 -2.183 0.049611 *
## V21 -2.1884 0.3854 -5.679 0.000103 ***
## V22 -1.9381 0.3887 -4.987 0.000316 ***
## V23 -0.5470 0.3880 -1.410 0.184038
## V24 0.5165 0.3858 1.339 0.205501
## V25 2.3413 0.3888 6.021 6.02e-05 ***
## V26 2.8418 0.3850 7.380 8.50e-06 ***
## V27 1.6678 0.3826 4.359 0.000930 ***
## V28 0.6291 0.3729 1.687 0.117384
## V29 -2.0131 0.4079 -4.936 0.000345 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.413 on 12 degrees of freedom
## Multiple R-squared: 0.9888, Adjusted R-squared: 0.9617
## F-statistic: 36.5 on 29 and 12 DF, p-value: 5.667e-08
coefplot::coefplot(TensorSplines_smooth_overfit_fit) + theme_bw()
TensorSplines_smooth_overfit_fit %>% glance()
## # A tibble: 1 x 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.989 0.962 0.413 36.5 5.67e-8 29 3.27 53.5 105.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
TensorSplines_overfit_test_basis_df <- PredictMat(TensorSplines_smooth_overfit, fine_df_1d) %>% as.data.frame() %>% tbl_df()
TensorSplines_overfit_test_pred <- predict(TensorSplines_smooth_overfit_fit, newdata = TensorSplines_overfit_test_basis_df)
fine_df_1d %>%
mutate(y_pred = TensorSplines_overfit_test_pred) %>%
ggplot(mapping = aes(x = x1, y = y_pred)) +
geom_line(size = 1.15) +
geom_point(mapping = aes(y = y), color = "red") +
theme_bw()
#Using regularization
TensorSplines_fit_func_Reg <- function(df_1, data, lambda){
Smooth <- smoothCon(ti(x1, k=df_1), data = data)[[1]]
Smooth_basis <- Smooth$X
Smooth_penalty <- Smooth$S
info_list <- list(
yobs = data$y,
basis_mat = as.matrix(Smooth_basis %>% as.data.frame() %>% tbl_df()),
penalty_mat = as.matrix(Smooth_penalty %>% as.data.frame() %>% tbl_df()),
lambda = lambda
)
start_guess = rnorm(n = ncol(info_list$basis_mat), mean = 0, sd = 2)
fit <- optim(start_guess,
minimize_error_func,
gr = NULL,
info_list,
method = "BFGS",
hessian = TRUE,
control = list(fnscale = 1, maxit = 1001))
return(list("fit" = fit, "Smooth" = Smooth))
}
TensorSplines_reg <- TensorSplines_fit_func_Reg(30, train_df_1d, 1.0)
TensorSplines_reg_test_basis_df <- PredictMat(TensorSplines_reg$Smooth, fine_df_1d) %>% as.data.frame() %>% tbl_df()
TensorSplines_reg_test_pred <- as.matrix(TensorSplines_reg_test_basis_df) %*% as.matrix(TensorSplines_reg$fit$par)
fine_df_1d %>%
mutate(y_pred = TensorSplines_reg_test_pred) %>%
mutate(y_pred_unreg = TensorSplines_overfit_test_pred) %>%
ggplot(mapping = aes(x = x1, y = y_pred)) +
geom_line(size = 1.15) +
geom_line(mapping = aes(y = y_pred_unreg), color = 'blue') +
geom_point(mapping = aes(y = y), color = "red") +
theme_bw()
PSplines_fit_func_No_Reg <- function(df_1, data){
model <- lm(y ~ smoothCon(s(x1, k=df_1, bs = 'ps'), data = data)[[1]]$X - 1, data = data)
metrics <- glance(model) %>% mutate(df_1 = df_1)
return(metrics)
}
df_grid_psplines<- list(x1 = c(5:18)) %>% as.data.frame() %>% tbl_df()
df_grid_psplines
## # A tibble: 14 x 1
## x1
## <int>
## 1 5
## 2 6
## 3 7
## 4 8
## 5 9
## 6 10
## 7 11
## 8 12
## 9 13
## 10 14
## 11 15
## 12 16
## 13 17
## 14 18
PSplines_comp_fit <- map_dfr(df_grid_psplines$x1, PSplines_fit_func_No_Reg, data = train_df_1d)
PSplines_comp_fit
## # A tibble: 14 x 13
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.522 0.455 1.56 7.85 4.43e- 5 5 -73.7 159. 170.
## 2 0.593 0.524 1.46 8.51 9.95e- 6 6 -70.4 155. 167.
## 3 0.966 0.959 0.426 139. 3.96e-23 7 -19.3 54.6 68.3
## 4 0.948 0.936 0.534 75.8 5.25e-19 8 -28.0 74.1 89.5
## 5 0.952 0.939 0.523 70.7 1.43e-18 9 -26.5 73.0 90.1
## 6 0.964 0.953 0.457 84.1 1.30e-19 10 -20.4 62.8 81.6
## 7 0.977 0.968 0.377 114. 2.45e-21 11 -11.8 47.6 68.2
## 8 0.976 0.966 0.389 97.9 4.14e-20 12 -12.4 50.8 73.1
## 9 0.976 0.965 0.397 86.8 4.52e-19 13 -12.5 53.0 77.0
## 10 0.977 0.966 0.392 83.0 1.84e-18 14 -11.2 52.4 78.1
## 11 0.978 0.965 0.395 76.2 1.31e-17 15 -10.8 53.5 81.0
## 12 0.980 0.967 0.385 75.5 3.77e-17 16 -8.88 51.8 80.9
## 13 0.983 0.971 0.357 82.9 3.46e-17 17 -4.95 45.9 76.7
## 14 0.982 0.969 0.373 71.5 5.61e-16 18 -5.94 49.9 82.4
## # … with 4 more variables: deviance <dbl>, df.residual <int>, nobs <int>,
## # df_1 <int>
df_best_PSplines_AIC <- PSplines_comp_fit %>% filter(AIC == min(AIC)) %>% select(df_1)
df_best_PSplines_AIC
## # A tibble: 1 x 1
## df_1
## <int>
## 1 17
PSplines_smooth_Best_AIC = smoothCon(s(x1, k=df_best_PSplines_AIC$df_1, bs = 'ps'), data = train_df_1d)[[1]]
PSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tibble::as_tibble()
## # A tibble: 41 x 17
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.160 0.666 0.174 4.55e-7 0. 0. 0. 0 0 0
## 2 0.0430 0.559 0.390 7.99e-3 0. 0. 0. 0 0 0
## 3 0.00396 0.340 0.596 6.03e-2 0. 0. 0. 0 0 0
## 4 0 0.138 0.663 1.99e-1 3.95e-5 0. 0. 0 0 0
## 5 0 0.0340 0.532 4.22e-1 1.16e-2 0. 0. 0 0 0
## 6 0 0.00229 0.308 6.16e-1 7.33e-2 0. 0. 0 0 0
## 7 0 0 0.118 6.55e-1 2.27e-1 2.21e-4 0. 0 0 0
## 8 0 0 0.0264 5.04e-1 4.53e-1 1.61e-2 0. 0 0 0
## 9 0 0 0.00117 2.77e-1 6.33e-1 8.80e-2 0. 0 0 0
## 10 0 0 0 9.96e-2 6.44e-1 2.56e-1 6.53e-4 0 0 0
## # … with 31 more rows, and 7 more variables: V11 <dbl>, V12 <dbl>, V13 <dbl>,
## # V14 <dbl>, V15 <dbl>, V16 <dbl>, V17 <dbl>
PSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tibble::as_tibble() %>%
mutate(x1 = train_df_1d$x1) %>%
tibble::rowid_to_column() %>%
tidyr::gather(key = "key", value = "value", -rowid, -x1) %>%
ggplot(mapping = aes(x = x1, y = value)) +
geom_line(mapping = aes(color = key),
size = 1.15) +
theme_bw() +
theme(legend.position = "top")
PSplines_train_df <- PSplines_smooth_Best_AIC$X %>% as.data.frame() %>% tbl_df() %>% mutate(y = train_df_1d$y)
PSplines_smooth_Best_AIC_fit <- lm(y ~ . - 1, data = PSplines_train_df)
PSplines_smooth_Best_AIC_fit %>% summary()
##
## Call:
## lm(formula = y ~ . - 1, data = PSplines_train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.56602 -0.16323 0.02515 0.21155 0.55547
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## V1 7.2483 5.3878 1.345 0.191101
## V2 -0.8174 1.2022 -0.680 0.503044
## V3 -0.5579 0.6497 -0.859 0.399076
## V4 -3.7147 0.5208 -7.132 2.26e-07 ***
## V5 -3.0047 0.4850 -6.195 2.11e-06 ***
## V6 -3.8905 0.4746 -8.198 2.04e-08 ***
## V7 1.5287 0.4712 3.244 0.003453 **
## V8 2.0734 0.4700 4.411 0.000186 ***
## V9 3.9005 0.4696 8.305 1.62e-08 ***
## V10 1.6545 0.4700 3.520 0.001752 **
## V11 0.3186 0.4712 0.676 0.505457
## V12 -2.3176 0.4746 -4.884 5.58e-05 ***
## V13 -1.9595 0.4850 -4.040 0.000476 ***
## V14 3.0821 0.5208 5.918 4.17e-06 ***
## V15 2.4674 0.6497 3.797 0.000878 ***
## V16 -1.1288 1.2022 -0.939 0.357113
## V17 -10.5723 5.3878 -1.962 0.061428 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3569 on 24 degrees of freedom
## Multiple R-squared: 0.9833, Adjusted R-squared: 0.9714
## F-statistic: 82.93 on 17 and 24 DF, p-value: < 2.2e-16
coefplot::coefplot(PSplines_smooth_Best_AIC_fit) + theme_bw()
PSplines_smooth_Best_AIC_fit %>% glance()
## # A tibble: 1 x 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.983 0.971 0.357 82.9 3.46e-17 17 -4.95 45.9 76.7
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
PSplines_test_basis_df <- PredictMat(PSplines_smooth_Best_AIC, fine_df_1d) %>% as.data.frame() %>% tbl_df()
PSplines_test_pred <- predict(PSplines_smooth_Best_AIC_fit, newdata = PSplines_test_basis_df)
fine_df_1d %>%
mutate(y_pred = PSplines_test_pred) %>%
ggplot(mapping = aes(x = x1, y = y_pred)) +
geom_line(size = 1.15) +
geom_point(mapping = aes(y = y), color = "red") +
theme_bw()
PSplines_smooth_overfit = smoothCon(s(x1, k=30, bs = 'ps'), data = train_df_1d)[[1]]
PSplines_smooth_overfit$X %>% as.data.frame() %>% tibble::as_tibble()
## # A tibble: 41 x 30
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.154 0.666 0.180 3.26e-6 0 0. 0 0. 0 0.
## 2 0.00447 0.348 0.590 5.73e-2 0 0. 0 0. 0 0.
## 3 0 0.0408 0.553 3.98e-1 0.00874 0. 0 0. 0 0.
## 4 0 0 0.144 6.64e-1 0.192 1.83e-5 0 0. 0 0.
## 5 0 0 0.00360 3.34e-1 0.600 6.26e-2 0 0. 0 0.
## 6 0 0 0 3.69e-2 0.541 4.12e-1 0.0103 0. 0 0.
## 7 0 0 0 0. 0.135 6.62e-1 0.203 5.44e-5 0 0.
## 8 0 0 0 0. 0.00285 3.20e-1 0.609 6.82e-2 0 0.
## 9 0 0 0 0. 0 3.32e-2 0.530 4.25e-1 0.0120 0.
## 10 0 0 0 0. 0 0. 0.126 6.59e-1 0.215 1.21e-4
## # … with 31 more rows, and 20 more variables: V11 <dbl>, V12 <dbl>, V13 <dbl>,
## # V14 <dbl>, V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>,
## # V20 <dbl>, V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>,
## # V26 <dbl>, V27 <dbl>, V28 <dbl>, V29 <dbl>, V30 <dbl>
PSplines_smooth_overfit$X %>% as.data.frame() %>% tibble::as_tibble() %>%
mutate(x1 = train_df_1d$x1) %>%
tibble::rowid_to_column() %>%
tidyr::gather(key = "key", value = "value", -rowid, -x1) %>%
ggplot(mapping = aes(x = x1, y = value)) +
geom_line(mapping = aes(color = key),
size = 1.15) +
theme_bw() +
theme(legend.position = "top")
PSplines_overfit_train_df <- PSplines_smooth_overfit$X %>% as.data.frame() %>% tbl_df() %>% mutate(y = train_df_1d$y)
PSplines_smooth_overfit_fit <- lm(y ~ . - 1, data = PSplines_overfit_train_df)
PSplines_smooth_overfit_fit %>% summary()
##
## Call:
## lm(formula = y ~ . - 1, data = PSplines_overfit_train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.38928 -0.15212 -0.00088 0.14416 0.36534
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## V1 6.13570 9.18901 0.668 0.518075
## V2 -0.42201 2.29614 -0.184 0.857522
## V3 -0.62361 1.12009 -0.557 0.588849
## V4 -0.67375 0.79853 -0.844 0.416787
## V5 -3.09608 0.70974 -4.362 0.001132 **
## V6 -2.15893 0.67808 -3.184 0.008701 **
## V7 -4.56984 0.67359 -6.784 3.02e-05 ***
## V8 -2.34446 0.66831 -3.508 0.004900 **
## V9 -3.78469 0.67053 -5.644 0.000150 ***
## V10 -2.86549 0.66807 -4.289 0.001279 **
## V11 0.02942 0.67016 0.044 0.965770
## V12 0.81295 0.66859 1.216 0.249468
## V13 2.66214 0.66990 3.974 0.002181 **
## V14 1.83883 0.66912 2.748 0.018953 *
## V15 3.90480 0.66956 5.832 0.000114 ***
## V16 2.68855 0.66956 4.015 0.002032 **
## V17 2.63589 0.66912 3.939 0.002315 **
## V18 0.99455 0.66990 1.485 0.165728
## V19 0.61642 0.66859 0.922 0.376327
## V20 -0.78277 0.67016 -1.168 0.267483
## V21 -1.20588 0.66807 -1.805 0.098487 .
## V22 -2.92490 0.67053 -4.362 0.001132 **
## V23 -1.14061 0.66831 -1.707 0.115912
## V24 0.20689 0.67359 0.307 0.764472
## V25 1.44574 0.67808 2.132 0.056374 .
## V26 4.00722 0.70974 5.646 0.000150 ***
## V27 0.99627 0.79853 1.248 0.238073
## V28 1.86743 1.12009 1.667 0.123659
## V29 -3.49694 2.29614 -1.523 0.155984
## V30 -0.22387 9.18901 -0.024 0.981000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3656 on 11 degrees of freedom
## Multiple R-squared: 0.9919, Adjusted R-squared: 0.97
## F-statistic: 45.15 on 30 and 11 DF, p-value: 5.957e-08
coefplot::coefplot(PSplines_smooth_overfit_fit) + theme_bw()
PSplines_smooth_overfit_fit %>% glance()
## # A tibble: 1 x 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.992 0.970 0.366 45.2 5.96e-8 30 10.0 41.9 95.0
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
PSplines_overfit_test_basis_df <- PredictMat(PSplines_smooth_overfit, fine_df_1d) %>% as.data.frame() %>% tbl_df()
PSplines_overfit_test_pred <- predict(PSplines_smooth_overfit_fit, newdata = PSplines_overfit_test_basis_df)
fine_df_1d %>%
mutate(y_pred = PSplines_overfit_test_pred) %>%
ggplot(mapping = aes(x = x1, y = y_pred)) +
geom_line(size = 1.15) +
geom_point(mapping = aes(y = y), color = "red") +
theme_bw()
#Using regularization
PSplines_fit_func_Reg <- function(df_1, data, lambda){
Smooth <- smoothCon(s(x1, k=df_1, bs = 'ps'), data = data)[[1]]
Smooth_basis <- Smooth$X
Smooth_penalty <- Smooth$S
info_list <- list(
yobs = data$y,
basis_mat = as.matrix(Smooth_basis %>% as.data.frame() %>% tbl_df()),
penalty_mat = as.matrix(Smooth_penalty %>% as.data.frame() %>% tbl_df()),
lambda = lambda
)
start_guess = rnorm(n = ncol(info_list$basis_mat), mean = 0, sd = 2)
fit <- optim(start_guess,
minimize_error_func,
gr = NULL,
info_list,
method = "BFGS",
hessian = TRUE,
control = list(fnscale = 1, maxit = 1001))
return(list("fit" = fit, "Smooth" = Smooth))
}
PSplines_reg <- PSplines_fit_func_Reg(30, train_df_1d, 1.5)
PSplines_reg_test_basis_df <- PredictMat(PSplines_reg$Smooth, fine_df_1d) %>% as.data.frame() %>% tbl_df()
PSplines_reg_test_pred <- as.matrix(PSplines_reg_test_basis_df) %*% as.matrix(PSplines_reg$fit$par)
fine_df_1d %>%
mutate(y_pred = PSplines_reg_test_pred) %>%
mutate(y_pred_unreg = PSplines_overfit_test_pred) %>%
ggplot(mapping = aes(x = x1, y = y_pred)) +
geom_line(size = 1.15) +
geom_line(mapping = aes(y = y_pred_unreg), color = 'blue') +
geom_point(mapping = aes(y = y), color = "red") +
theme_bw()